Temporary script for predicting Flagellate production rate
Importing
In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from skfda.representation.grid import FDataGrid
from skfda.ml.clustering import KMeans
import xgboost as xgb
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import root_mean_squared_error as rmse # requires scikit-learn >= 1.4
from tqdm import tqdm
import dill
import random
import salishsea_tools.viz_tools as sa_vi
Datasets Preparation
In [ ]:
def datasets_preparation(dataset, dataset2, clusters, name):
    drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']),
        np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)']),
        np.ravel(dataset2['Summation_of_solar_radiation']),
        np.ravel(dataset2['Mean_wind_speed']),
        np.ravel(dataset2['Mean_air_temperature']),
        np.repeat(dataset.time_counter.dt.dayofyear, len(dataset.x)*len(dataset.y)) # day of year as the temporal driver (column 7)
        ])
    x = np.tile(dataset.x, len(dataset.time_counter)*len(dataset.y))
    y = np.tile(np.repeat(dataset.y, len(dataset.x)), len(dataset.time_counter))
    indx = np.where(~np.isnan(drivers).any(axis=0) & (x>10) & ((x>100) | (y<880))) # drop NaN points and crop to the analysis domain
    drivers = drivers[:,indx[0]]
    phyto = np.ravel(dataset[name])
    phyto = phyto[indx[0]]
    clusters = np.tile(np.ravel(clusters), len(dataset.time_counter))
    clusters = clusters[indx[0]]
    drivers = drivers.transpose() # (points, drivers) for scikit-learn
    return (drivers, phyto, indx, clusters)
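As a sanity check on the flattening convention, here is a minimal sketch that calls the function on tiny synthetic datasets. Every size, coordinate value, and variable value below is hypothetical; the x offset is chosen so the spatial mask keeps all points.

# Minimal shape check with synthetic data (all sizes and values hypothetical).
t, ny, nx = 3, 4, 5
coords = {'time_counter': pd.date_range('2007-02-15', periods=t), 'y': np.arange(ny), 'x': np.arange(101, 101+nx)}
def rand_var():
    return (['time_counter', 'y', 'x'], np.random.rand(t, ny, nx))
toy = xr.Dataset({v: rand_var() for v in ['Temperature_(0m-15m)', 'Temperature_(15m-100m)', 'Salinity_(0m-15m)', 'Salinity_(15m-100m)', 'Flagellate_Production_Rate']}, coords=coords)
toy2 = xr.Dataset({v: rand_var() for v in ['Summation_of_solar_radiation', 'Mean_wind_speed', 'Mean_air_temperature']}, coords=coords)
drivers, phyto, indx, cl = datasets_preparation(toy, toy2, np.zeros((ny, nx)), 'Flagellate_Production_Rate')
print(drivers.shape, phyto.shape) # (60, 8) (60,): 7 drivers plus day of year, one row per kept point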
Functional Clustering (target)
In [ ]:
def func_clust_target(dataset, name):
    input = np.reshape(np.ravel(dataset[name]), (len(dataset.time_counter), len(dataset.y) * len(dataset.x))) # rows: days, columns: grid cells
    x = np.tile(dataset.x, len(dataset.y))
    y = np.repeat(dataset.y, len(dataset.x))
    indx = np.where((~np.isnan(input).any(axis=0)) & (x>10) & ((x>100) | (y<880)))
    input = input[:, indx[0]]
    input = input.transpose() # one time series per grid cell
    input = minmax_scale(input)
    # Converting it to an appropriate format for functional clustering
    input2 = FDataGrid(input)
    # Training
    n_clusters = 6
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(input2)
    clusters = kmeans.predict(input2)
    # Creating the map
    indx2 = np.full(len(dataset.y) * len(dataset.x), np.nan)
    indx2[indx[0]] = clusters
    clusters = np.reshape(indx2, (len(dataset.y), len(dataset.x)))
    # Preparation of the dataarray
    clusters2 = xr.DataArray(clusters,
        coords = {'y': dataset.y, 'x': dataset.x},
        dims = ['y','x'],
        attrs=dict(description="Clusters from the functional clustering algorithm",
            long_name="Cluster",
            units="count"),)
    # Plotting
    clusters2.plot()
    plt.title('Functional Clustering for ' + name)
    return (clusters)
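n_clusters = 6 is hard-coded in both clustering functions. If that choice ever needs revisiting, an elbow check is one option; a minimal sketch, run inside the function after input2 is built, and assuming skfda's KMeans exposes an inertia_ attribute after fitting, as scikit-learn's KMeans does:

# Hypothetical elbow check for n_clusters (assumes skfda's KMeans sets inertia_, like scikit-learn's).
inertias = []
for k in range(2, 10):
    km = KMeans(n_clusters=k)
    km.fit(input2)
    inertias.append(km.inertia_)
plt.plot(range(2, 10), inertias, marker='.')
plt.xlabel('n_clusters')
plt.ylabel('inertia')
plt.show()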
Functional Clustering (drivers)
In [ ]:
def func_clust_drivers(dataset, dataset2, name):
    shape = (len(dataset.time_counter), len(dataset.y) * len(dataset.x)) # (days, grid cells)
    input = np.stack([np.reshape(np.ravel(dataset['Temperature_(0m-15m)']), shape),
        np.reshape(np.ravel(dataset['Temperature_(15m-100m)']), shape),
        np.reshape(np.ravel(dataset['Salinity_(0m-15m)']), shape),
        np.reshape(np.ravel(dataset['Salinity_(15m-100m)']), shape),
        np.reshape(np.ravel(dataset2['Summation_of_solar_radiation']), shape),
        np.reshape(np.ravel(dataset2['Mean_wind_speed']), shape),
        np.reshape(np.ravel(dataset2['Mean_air_temperature']), shape),
        ])
    x = np.tile(dataset.x, len(dataset.y))
    y = np.repeat(dataset.y, len(dataset.x))
    indx = np.where((~np.isnan(input[1]).any(axis=0)) & (x>10) & ((x>100) | (y<880))) # input[1] (15m-100m) because this variable extends down to 100m, so its NaN mask is the most restrictive
    input = input[:,:,indx[0]]
    input = np.transpose(input, axes=(0,2,1)) # (variable, cell, day): the right shape for preprocessing
    # Transforming each variable individually
    for k in range(input.shape[0]):
        input[k] = minmax_scale(input[k])
    # Converting it to an appropriate format for functional clustering
    input = np.transpose(input, axes=(1,2,0)) # (cell, day, variable): the shape FDataGrid expects
    input2 = FDataGrid(input, np.arange(0, len(dataset.time_counter)))
    # Training
    n_clusters = 6
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(input2)
    clusters = kmeans.predict(input2)
    # Creating the map
    indx2 = np.full(len(dataset.y) * len(dataset.x), np.nan)
    indx2[indx[0]] = clusters
    clusters = np.reshape(indx2, (len(dataset.y), len(dataset.x)))
    # Preparation of the dataarray
    clusters2 = xr.DataArray(clusters,
        coords = {'y': dataset.y, 'x': dataset.x},
        dims = ['y','x'],
        attrs=dict(description="Clusters from the functional clustering algorithm",
            long_name="Cluster",
            units="count"),)
    # Plotting
    clusters2.plot()
    plt.title('Functional Clustering for drivers')
    return (clusters)
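The two transposes above exist because FDataGrid expects samples first: for vector-valued functional data the data matrix has shape (n_samples, n_grid_points, dim_codomain). A tiny sketch with random numbers (sizes hypothetical):

# FDataGrid shape convention: (n_samples, n_grid_points, dim_codomain).
demo = np.random.rand(50, 30, 7) # 50 grid cells, 30 days, 7 drivers (hypothetical sizes)
fd = FDataGrid(demo, np.arange(30)) # one vector-valued curve per grid cell
print(fd.n_samples, fd.dim_codomain) # 50 7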
Regressor (Training)
In [ ]:
def regressor (inputs, targets, clusters):
    model = TransformedTargetRegressor(regressor=make_pipeline(ColumnTransformer(
        transformers=[('temporal', OneHotEncoder(), [7])], remainder=MinMaxScaler()), # one-hot the day of year (column 7), min-max scale the other drivers
        xgb.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)),
        transformer=MinMaxScaler()) # the target is min-max scaled as well
    regr_all = []
    for i in range (0, len(np.unique(clusters))):
        indx2 = np.where(clusters==i) # indices of cluster i
        inputs2 = inputs[indx2[0]] # inputs belonging to cluster i
        targets2 = targets[indx2]
        regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(inputs2, targets2) # one bagged ensemble per cluster
        regr_all.append(regr)
    return (regr_all)
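Column 7 of the inputs is the day of year (the last driver stacked in datasets_preparation), so the pipeline one-hot encodes it while min-max scaling the seven physical drivers, and TransformedTargetRegressor additionally min-max scales the target. A toy check of just the column split (numbers hypothetical):

# Toy check: column 7 is one-hot encoded, the remaining columns are min-max scaled.
ct = ColumnTransformer(transformers=[('temporal', OneHotEncoder(sparse_output=False), [7])], remainder=MinMaxScaler())
toy = np.array([[0., 1., 2., 3., 4., 5., 6., 46.],
                [1., 2., 3., 4., 5., 6., 7., 47.]])
print(ct.fit_transform(toy)) # two one-hot columns (days 46 and 47) first, then the 7 scaled drivers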
Regressor (Other Years (Annually))
In [ ]:
def regressor2 (inputs, targets, clusters):
    # Outputs for each regressor; regr_all comes from the notebook's global scope (set in the Training cell)
    outputs = []
    outputs_all = np.full(len(targets), np.nan) # size of a year without nans
    for i in range (0, len(np.unique(clusters))):
        indx2 = np.where(clusters==i) # indices of cluster i
        inputs2 = inputs[indx2[0]] # inputs belonging to cluster i
        outputs.append(regr_all[i].predict(inputs2))
        outputs_all[indx2] = outputs[i] # putting them in the right place
    m, _ = np.polyfit(targets, outputs_all, deg=1)
    r = np.round(np.corrcoef(targets, outputs_all)[0][1], 3)
    rms = rmse(targets, outputs_all)
    return (r, rms, m)
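Note that regressor2 (like regressor3 and regressor4 below) reads regr_all from the notebook's global scope, where the Training cell defines it. If these helpers ever move into a module, one option is to pass the fitted models in explicitly; a sketch of that variant, not the notebook's current behaviour:

# Sketch: the same prediction logic with the fitted per-cluster models passed in explicitly.
def regressor2_explicit(inputs, targets, clusters, models):
    outputs_all = np.full(len(targets), np.nan)
    for i in range(len(models)):
        indx2 = np.where(clusters == i)
        outputs_all[indx2] = models[i].predict(inputs[indx2[0]])
    m, _ = np.polyfit(targets, outputs_all, deg=1)
    r = np.round(np.corrcoef(targets, outputs_all)[0][1], 3)
    return (r, rmse(targets, outputs_all), m)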
Regressor (Other Years (Daily))
In [ ]:
def regressor3 (inputs, targets, clusters):
    # Outputs for each regressor; regr_all again comes from the notebook's global scope
    outputs = []
    outputs_all = np.full(len(targets), np.nan) # size of a day without nans
    for j in range (0, len(np.unique(clusters))):
        indx2 = np.where(clusters==j) # indices of cluster j
        inputs2 = inputs[indx2[0]] # inputs belonging to cluster j
        outputs.append(regr_all[j].predict(inputs2))
        outputs_all[indx2] = outputs[j] # putting them in the right place
    ave_model = np.mean(outputs_all) # domain-averaged prediction for the day
    m, _ = np.polyfit(targets, outputs_all, deg=1)
    r = np.round(np.corrcoef(targets, outputs_all)[0][1], 3)
    rms = rmse(targets, outputs_all)
    return (r, rms, m, ave_model)
Regressor (Daily Maps)
In [ ]:
def regressor4 (inputs, targets, indx, targets_i, clusters, name):
    # Outputs for each regressor; regr_all and the current date i come from the notebook's global scope
    outputs = []
    outputs_all = np.full(len(targets), np.nan) # size of a day without nans
    for j in range (0, len(np.unique(clusters))):
        indx2 = np.where(clusters==j) # indices of cluster j
        inputs2 = inputs[indx2[0]] # inputs belonging to cluster j
        outputs.append(regr_all[j].predict(inputs2))
        outputs_all[indx2] = outputs[j] # putting them in the right place
    m = scatter_plot(targets, outputs_all, name + ' ' + str(i.dt.date.values))
    # Creating a variable with nans
    model = np.full((len(targets_i.y)*len(targets_i.x)), np.nan) # full day grid, NaN where masked
    model[indx[0]] = outputs_all # putting the predictions in the right place
    model = np.reshape(model, (len(targets_i.y), len(targets_i.x)))
    # Preparation of the dataarray
    model = xr.DataArray(model,
        coords = {'y': targets_i.y, 'x': targets_i.x},
        dims = ['y','x'],
        attrs=dict(long_name = name + " Concentration",
            units="mmol m-2"),)
    plotting3(targets, model, targets_i, name)
Printing
In [ ]:
def printing (targets, outputs, m):
    print ('The amount of data points is', outputs.size)
    print ('The slope of the best fitting line is ', np.round(m,3))
    print ('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1],3))
    print ('The root mean square error is:', rmse(targets,outputs))
Scatter Plot
In [ ]:
def scatter_plot(targets, outputs, variable_name):
    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)
    printing(targets, outputs, m)
    fig, ax = plt.subplots(2, figsize=(5,10), layout='constrained')
    ax[0].scatter(targets, outputs, alpha = 0.2, s = 10)
    lims = [np.min([ax[0].get_xlim(), ax[0].get_ylim()]),
        np.max([ax[0].get_xlim(), ax[0].get_ylim()])]
    # plot fitted y = m*x + b
    ax[0].axline(xy1=(0, b), slope=m, color='r')
    ax[0].set_xlabel('targets')
    ax[0].set_ylabel('outputs')
    ax[0].set_xlim(lims)
    ax[0].set_ylim(lims)
    ax[0].set_aspect('equal')
    ax[0].plot(lims, lims, linestyle = '--', color = 'k') # 1:1 line
    h = ax[1].hist2d(targets, outputs, bins=100, cmap='jet',
        range=[lims,lims], cmin=0.1, norm='log')
    ax[1].plot(lims, lims, linestyle = '--', color = 'k') # 1:1 line
    # plot fitted y = m*x + b
    ax[1].axline(xy1=(0, b), slope=m, color='r')
    ax[1].set_xlabel('targets')
    ax[1].set_ylabel('outputs')
    ax[1].set_aspect('equal')
    fig.colorbar(h[3], ax=ax[1], location='bottom')
    fig.suptitle(variable_name)
    plt.show()
    return (m)
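An aside: the slope, intercept, and correlation computed here with separate np.polyfit and np.corrcoef calls are also available from a single scipy.stats.linregress call, if scipy is acceptable as a dependency. A minimal alternative (not what the notebook uses; the numbers are hypothetical):

# Alternative: slope, intercept, and r in one call (requires scipy).
from scipy.stats import linregress
res = linregress([0., 1., 2., 3.], [0.1, 0.9, 2.2, 2.9])
print(res.slope, res.intercept, res.rvalue)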
Plotting (Other Years (Annually))
In [ ]:
def plotting(variable, name):
    plt.plot(years, variable, marker = '.', linestyle = '') # years comes from the notebook's global scope
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()
Plotting (Other Years (Daily))
In [ ]:
def plotting2(variable, title):
    fig, ax = plt.subplots()
    scatter = ax.scatter(dates, variable, marker='.', c=pd.DatetimeIndex(dates).month) # dates comes from the notebook's global scope
    ax.legend(handles=scatter.legend_elements()[0], labels=['February','March','April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    plt.show()
Plotting (Daily Maps)
In [ ]:
def plotting3(targets, model, targets_i, name):
    fig, ax = plt.subplots(2, 2, figsize = (10,15))
    cmap = plt.get_cmap('cubehelix').copy() # copy so set_bad does not modify the registered colormap
    cmap.set_bad('gray')
    targets_i.plot(ax=ax[0,0], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': name + ' Concentration [mmol m-2]'})
    model.plot(ax=ax[0,1], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': name + ' Concentration [mmol m-2]'})
    ((targets_i-model) / targets_i * 100).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': name + ' relative difference [%]'})
    plt.subplots_adjust(left=0.1,
        bottom=0.1,
        right=0.95,
        top=0.95,
        wspace=0.35,
        hspace=0.35)
    sa_vi.set_aspect(ax[0,0])
    sa_vi.set_aspect(ax[0,1])
    sa_vi.set_aspect(ax[1,0])
    ax[0,0].title.set_text(name + ' (targets)')
    ax[0,1].title.set_text(name + ' (outputs)')
    ax[1,0].title.set_text('(targets - outputs) / targets [%]')
    ax[1,1].axis('off')
    fig.suptitle(str(i.dt.date.values)) # i (the current date) comes from the notebook's global scope
    plt.show()
Training
In [ ]:
name = 'Flagellate_Production_Rate'
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')
# ds = ds.isel(time_counter = (np.arange(0, len(ds.time_counter),2)),
# y=(np.arange(ds.y[0], ds.y[-1], 5)),
# x=(np.arange(ds.x[0], ds.x[-1], 5)))
# ds2 = ds2.isel(time_counter = (np.arange(0, len(ds2.time_counter),2)),
# y=(np.arange(ds2.y[0], ds2.y[-1], 5)),
# x=(np.arange(ds2.x[0], ds2.x[-1], 5)))
dataset = ds.sel(time_counter = slice('2007', '2020'))
dataset2 = ds2.sel(time_counter = slice('2007', '2020'))
# Can choose the inputs of the functional clustering map - drivers or target
clusters0 = func_clust_drivers(dataset, dataset2, name)
inputs, targets, _, clusters = datasets_preparation(dataset, dataset2, clusters0, name)
regr_all = regressor(inputs, targets, clusters)
ds = ds.sel(time_counter = slice('2021', '2024'))
ds2 = ds2.sel(time_counter = slice('2021', '2024'))
dates = pd.DatetimeIndex(ds['time_counter'].values)
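dill is imported at the top but not used anywhere in this section; presumably the fitted ensembles are meant to be saved for reuse. A minimal sketch of how that could look (the file name is hypothetical):

# Sketch: persist the fitted per-cluster ensembles with dill (hypothetical file name).
with open('regr_all_flagellate.pkl', 'wb') as f:
    dill.dump(regr_all, f)
# Later, reload with:
# with open('regr_all_flagellate.pkl', 'rb') as f:
#     regr_all = dill.load(f)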
Other Years (Annually)
In [ ]:
years = range (2021, 2025)
r_all = np.array([])
rms_all = np.array([])
slope_all = np.array([])
for i in tqdm(years):
    dataset = ds.sel(time_counter=str(i))
    dataset2 = ds2.sel(time_counter=str(i))
    inputs, targets, _, clusters = datasets_preparation(dataset, dataset2, clusters0, name)
    r, rms, m = regressor2(inputs, targets, clusters)
    r_all = np.append(r_all, r)
    rms_all = np.append(rms_all, rms)
    slope_all = np.append(slope_all, m)
plotting(r_all, name)
plotting(rms_all, name)
plotting(slope_all, name)
100%|██████████| 4/4 [18:49<00:00, 282.26s/it]
Other Years (Daily)
In [ ]:
r_all = np.array([])
rms_all = np.array([])
slope_all = np.array([])
mean_meas = np.array([])
mean_model = np.array([])
for i in tqdm(ds.time_counter):
    dataset = ds.sel(time_counter=slice(i,i))
    dataset2 = ds2.sel(time_counter=slice(i,i))
    inputs, targets, indx, clusters = datasets_preparation(dataset, dataset2, clusters0, name)
    ave_meas = np.mean(targets)
    r, rms, m, ave_model = regressor3(inputs, targets, clusters)
    r_all = np.append(r_all, r)
    rms_all = np.append(rms_all, rms)
    slope_all = np.append(slope_all, m)
    mean_meas = np.append(mean_meas, ave_meas)
    mean_model = np.append(mean_model, ave_model)
plotting2(r_all, 'Correlation Coefficients')
plotting2(rms_all, 'Root Mean Square Errors')
plotting2(slope_all, 'Slope of the best fitting line')
100%|██████████| 301/301 [1:37:55<00:00, 19.52s/it]
Daily Maps
In [ ]:
maps = random.sample(sorted(ds.time_counter), 10) # ten random days from the 2021-2024 evaluation period
for i in tqdm(maps):
    dataset = ds.sel(time_counter=slice(i,i))
    dataset2 = ds2.sel(time_counter=slice(i,i))
    inputs, targets, indx, clusters = datasets_preparation(dataset, dataset2, clusters0, name)
    targets_i = dataset[name]
    regressor4(inputs, targets, indx, targets_i, clusters, name)
The amount of data points is 45897; the slope of the best fitting line is 0.432; the correlation coefficient is 0.688; the root mean square error is 2.3749528412432296e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.645; the correlation coefficient is 0.935; the root mean square error is 1.91441602444467e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.618; the correlation coefficient is 0.886; the root mean square error is 1.1824162249367585e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.548; the correlation coefficient is 0.801; the root mean square error is 1.4978081528117846e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.436; the correlation coefficient is 0.744; the root mean square error is 2.1171518223346056e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.593; the correlation coefficient is 0.849; the root mean square error is 1.9906411092411423e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.453; the correlation coefficient is 0.818; the root mean square error is 1.7288292649545673e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.725; the correlation coefficient is 0.74; the root mean square error is 2.2370038367661493e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.471; the correlation coefficient is 0.572; the root mean square error is 2.0424793493914724e-07.
The amount of data points is 45897; the slope of the best fitting line is 0.732; the correlation coefficient is 0.809; the root mean square error is 1.25414085604773e-07.
100%|██████████| 10/10 [03:36<00:00, 21.66s/it]
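Since random.sample is called without a fixed seed, each rerun of the Daily Maps cell picks a different set of ten days; seeding first would make the selection reproducible (optional sketch, seed value hypothetical):

# Optional: seed the RNG so reruns select the same ten dates.
random.seed(42) # hypothetical seed
maps = random.sample(sorted(ds.time_counter), 10)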
Daily Time-series
In [ ]:
plt.plot(dates, mean_meas, marker = '.', linestyle = '', label = 'targets')
plt.plot(dates, mean_model, marker = '.', linestyle = '', label = 'outputs')
plt.xlabel('Years')
plt.ylabel(name + ' Concentration [mmol m-2]')
plt.suptitle('Daily Time-series')
plt.legend()
plt.show()